Original Data with Cross-Validation¶

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

# Load the data
# NOTE(review): hardcoded absolute Windows path — not portable; consider a
# configurable DATA_DIR.
data_path = r"C:\Users\Owner\Desktop\GA_data\Original.csv"
df = pd.read_csv(data_path)
df.head()  # no-op mid-cell: the result is discarded, not displayed

# 'EVI' is the dependent variable; every remaining column is a predictor.
# (An earlier note here mentioned 'Join_count', which did not match the code.)
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable

# Standardize the features
# NOTE(review): the scaler is fit on the FULL dataset before the split, so
# test-set statistics leak into training. Fitting on X_train only (or using a
# Pipeline) would give an honest held-out evaluation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)

# Fit seven regressors on the standardized training split.
# NOTE(review): the alpha values below are untuned magic numbers; the
# ConvergenceWarnings in this cell's output suggest raising max_iter (or the
# regularization strength) for the coordinate-descent models (Lasso/ElasticNet).

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Lasso Regression
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)

# Support Vector Regression (linear kernel keeps it comparable to the linear
# models above)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Decision Tree Regression
# NOTE(review): no random_state on the tree/forest — results vary between runs.
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print and return test-set regression metrics.

    Parameters
    ----------
    model : estimator
        Unused — retained so existing call sites keep working.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mse, r2)`` — returned so callers can collect metrics
        programmatically instead of scraping stdout (previously returned None).
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2

# Report test-set metrics for every fitted model (replaces seven hand-written
# print/evaluate pairs with a single data-driven loop; output is identical).
test_set_reports = [
    ("Linear Regression", linear_model, y_pred_linear),
    ("Lasso Regression", lasso_model, y_pred_lasso),
    ("Ridge Regression", ridge_model, y_pred_ridge),
    ("Elastic Net Regression", elastic_net_model, y_pred_elastic_net),
    ("Support Vector Regression", svr_model, y_pred_svr),
    ("Decision Tree Regression", tree_model, y_pred_tree),
    ("Random Forest Regression", rf_model, y_pred_rf),
]
for report_label, fitted_model, test_preds in test_set_reports:
    print(f"\n{report_label}:")
    evaluate_model(fitted_model, y_test, test_preds)

# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R^2, MSE) for `model` under cv-fold CV."""
    oof_preds = cross_val_predict(model, x_data, y_data, cv=cv)
    return oof_preds, r2_score(y_data, oof_preds), mean_squared_error(y_data, oof_preds)

# Cross-Validation for each model
# NOTE(review): CV runs on the full X_scaled/y (the earlier held-out test rows
# are included in the folds), and the tree/forest models carry no random_state,
# so these figures shift slightly between runs.
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)

# Display Cross-Validation Results: one row per model with its out-of-fold
# R^2 and MSE (same frame as before, built from records instead of columns).
cv_records = list(zip(
    ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
     'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2, svr_cv_r2, tree_cv_r2, rf_cv_r2],
    [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse, svr_cv_mse, tree_cv_mse, rf_cv_mse],
))
cv_results = pd.DataFrame(cv_records, columns=['Model', 'R-Squared', 'Mean Squared Error'])

print("\nCross-Validation Results:")
print(cv_results)

# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Variance inflation factor for every column of DataFrame X.

    Returns a DataFrame with one row per predictor: ('Variable', 'VIF').
    """
    exog = X.values
    vif_scores = [variance_inflation_factor(exog, col) for col in range(exog.shape[1])]
    return pd.DataFrame({"Variable": X.columns, "VIF": vif_scores})

# Calculate VIF for independent variables
# NOTE(review): VIF is computed on the raw, unscaled X without an added
# intercept column; statsmodels' VIF is usually computed with a constant in
# the exog matrix — confirm this is intentional.
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)

# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']

plt.figure(figsize=(15, 10))

# One scatter per model on a shared axes; points on the dashed diagonal are
# perfect predictions.
for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred, label=name)

plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)  # Diagonal line
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on Original Data")
plt.legend()
plt.show()

# Per-model coefficient / feature-importance bar charts.
# NOTE(review): the six near-identical sections below could be one helper
# function. Coefficients come from .coef_ for the linear family and from
# .feature_importances_ for the tree ensembles; since the models were fit on
# standardized features, coefficient magnitudes are comparable across features.

# Plotting coefficients for Linear Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_linear = pd.Series(linear_model.coef_, index=X.columns)
ft_importances_lm_linear.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("Original Data Coefficients - Linear Regression")
plt.show()

# Plotting coefficients for Lasso Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_lasso = pd.Series(lasso_model.coef_, index=X.columns)
ft_importances_lm_lasso.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("Original Data Coefficients - Lasso Regression")
plt.show()

# Plotting coefficients for Ridge Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_ridge = pd.Series(ridge_model.coef_, index=X.columns)
ft_importances_lm_ridge.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("Original Data Coefficients - Ridge Regression")
plt.show()

# Plotting coefficients for Elastic Net Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_elastic_net = pd.Series(elastic_net_model.coef_, index=X.columns)
ft_importances_lm_elastic_net.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("Original Data Coefficients - Elastic Net Regression")
plt.show()

# Support Vector Regression does not provide coefficients in the same way as linear models

# Plotting feature importance for Decision Tree Regression
plt.figure(figsize=(15, 10))
ft_importances_tree = pd.Series(tree_model.feature_importances_, index=X.columns)
ft_importances_tree.plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("Feature Variables")
plt.title("Original Data Feature Importance - Decision Tree Regression")
plt.show()

# Plotting feature importance for Random Forest Regression
plt.figure(figsize=(15, 10))
ft_importances_rf = pd.Series(rf_model.feature_importances_, index=X.columns)
ft_importances_rf.plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("Feature Variables")
plt.title("Original Data Feature Importance - Random Forest Regression")
plt.show()
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.721e+07, tolerance: 8.868e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.782e+05, tolerance: 8.868e+04
  model = cd_fast.enet_coordinate_descent(
Linear Regression:
Mean Squared Error: 223331.15782518263
R-squared: 0.7964451599002667

Lasso Regression:
Mean Squared Error: 222595.76858534582
R-squared: 0.7971154292911737

Ridge Regression:
Mean Squared Error: 221551.15979959743
R-squared: 0.7980675363613221

Elastic Net Regression:
Mean Squared Error: 220207.85858986876
R-squared: 0.7992918861816286

Support Vector Regression:
Mean Squared Error: 374004.55071556923
R-squared: 0.6591141278322075

Decision Tree Regression:
Mean Squared Error: 209987.18842077037
R-squared: 0.808607500277945

Random Forest Regression:
Mean Squared Error: 106268.82049654558
R-squared: 0.903141447102987
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.305e+07, tolerance: 9.828e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.564e+07, tolerance: 9.972e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.008e+08, tolerance: 1.064e+05
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.025e+07, tolerance: 9.586e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.695e+07, tolerance: 1.100e+05
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.152e+05, tolerance: 1.064e+05
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.086e+05, tolerance: 1.100e+05
  model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
                       Model  R-Squared  Mean Squared Error
0          Linear Regression   0.326579       710199.371805
1           Lasso Regression   0.348073       687531.375665
2           Ridge Regression   0.393373       639757.531989
3     Elastic Net Regression   0.492821       534878.717769
4  Support Vector Regression   0.524563       501402.773271
5   Decision Tree Regression   0.594728       427405.415835
6   Random Forest Regression   0.793910       217345.728136

VIF Results:
    Variable          VIF
0       Psum    12.146660
1       Temp    42.639319
2         pp    84.841281
3        gdp   247.855007
4       gova    18.969250
5         aa    10.041142
6         gp     9.582982
7         ls     1.623550
8        fai    36.072238
9        lgr    45.481630
10      iofp     2.607295
11       loh     6.893733
12      rpop    19.704394
13       hmp    36.194699
14       hpb    33.042339
15       mht    58.641005
16       pte    14.939183
17      tcrv   120.437473
18    pcurei     1.962042
19       pio    32.749696
20       sio    33.612325
21       tio    60.228057
22    croppg     3.471574
23    forest     2.261841
24     grass    20.862482
25      sand     2.162086
26     shrub     1.526806
27     urban     1.715978
28     water     2.176167
29   wetland     1.823453
30     bus_p    20.604707
31   dustgdp  1285.329290
32    fosgdp     3.716668
33   inter_p    31.507092
34   invpa_p    14.234124
35  mobile_p    10.779286
36    noxgdp     4.492853
37    road_p    32.125470
38    sdxgdp  1282.351539
39     taxiv    33.683888
40     telev     8.832584
41    wateru     8.771482

EMD_Residual with Cross-Validation¶

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\EMD_Residual.csv"
df = pd.read_csv(data_path)

# 'EVI' is the dependent variable; every remaining column is a predictor.
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable

# BUG FIX: the original called np.nan_to_num(..., nan=np.nan, posinf=np.nan,
# neginf=np.nan) AFTER imputation, which replaces nothing with anything finite
# (NaN stays NaN, ±inf becomes NaN) and so could still hand non-finite values
# to the estimators. Map ±inf to NaN BEFORE imputing, so the mean imputer
# fills them along with the ordinary missing values.
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values (column means)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Drop any rows whose target is NaN/±inf — scikit-learn estimators reject
# non-finite targets. When y is already finite this is a no-op.
finite_target = np.isfinite(np.asarray(y, dtype=float))
X_imputed = X_imputed[finite_target]
y = np.asarray(y, dtype=float)[finite_target]

# Standardize the features
# NOTE(review): the scaler is fit on the full dataset before the split, so
# test-set statistics leak into training; kept for comparability with the
# recorded outputs, but fitting on X_train only (or a Pipeline) is cleaner.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)

# Fit seven regressors on the standardized training split.
# NOTE(review): the alpha values below are untuned magic numbers; the
# ConvergenceWarning in this cell's output suggests raising max_iter (or the
# regularization strength) for the coordinate-descent models (Lasso/ElasticNet).

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Lasso Regression
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)

# Support Vector Regression (linear kernel keeps it comparable to the linear
# models above)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Decision Tree Regression
# NOTE(review): no random_state on the tree/forest — results vary between runs.
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print and return test-set regression metrics.

    Parameters
    ----------
    model : estimator
        Unused — retained so existing call sites keep working.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mse, r2)`` — returned so callers can collect metrics
        programmatically instead of scraping stdout (previously returned None).
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2

# Report test-set metrics for every fitted model (replaces seven hand-written
# print/evaluate pairs with a single data-driven loop; output is identical).
test_set_reports = [
    ("Linear Regression", linear_model, y_pred_linear),
    ("Lasso Regression", lasso_model, y_pred_lasso),
    ("Ridge Regression", ridge_model, y_pred_ridge),
    ("Elastic Net Regression", elastic_net_model, y_pred_elastic_net),
    ("Support Vector Regression", svr_model, y_pred_svr),
    ("Decision Tree Regression", tree_model, y_pred_tree),
    ("Random Forest Regression", rf_model, y_pred_rf),
]
for report_label, fitted_model, test_preds in test_set_reports:
    print(f"\n{report_label}:")
    evaluate_model(fitted_model, y_test, test_preds)

# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R^2, MSE) for `model` under cv-fold CV."""
    oof_preds = cross_val_predict(model, x_data, y_data, cv=cv)
    return oof_preds, r2_score(y_data, oof_preds), mean_squared_error(y_data, oof_preds)

# Cross-Validation for each model
# NOTE(review): CV runs on the full X_scaled/y (the earlier held-out test rows
# are included in the folds), and the tree/forest models carry no random_state,
# so these figures shift slightly between runs.
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)

# Display Cross-Validation Results: one row per model with its out-of-fold
# R^2 and MSE (same frame as before, built from records instead of columns).
cv_records = list(zip(
    ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
     'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2, svr_cv_r2, tree_cv_r2, rf_cv_r2],
    [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse, svr_cv_mse, tree_cv_mse, rf_cv_mse],
))
cv_results = pd.DataFrame(cv_records, columns=['Model', 'R-Squared', 'Mean Squared Error'])

print("\nCross-Validation Results:")
print(cv_results)

# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Variance inflation factor for every column of DataFrame X.

    Returns a DataFrame with one row per predictor: ('Variable', 'VIF').
    """
    exog = X.values
    vif_scores = [variance_inflation_factor(exog, col) for col in range(exog.shape[1])]
    return pd.DataFrame({"Variable": X.columns, "VIF": vif_scores})

# Calculate VIF for independent variables
# NOTE(review): VIF is computed on the raw X, not the imputed X_imputed — any
# remaining NaNs in X would propagate into the VIF values; also no intercept
# column is added, which statsmodels' VIF usually expects. Confirm intent.
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)

# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']

plt.figure(figsize=(15, 10))

# One scatter per model on a shared axes; points on the dashed diagonal are
# perfect predictions.
for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred, label=name)

plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)  # Diagonal line
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on EMD_Residual Data")
plt.legend()
plt.show()

# Per-model coefficient / feature-importance bar charts.
# NOTE(review): coefficients come from .coef_ for the linear family and
# .feature_importances_ for the tree ensembles. If SimpleImputer dropped any
# all-NaN column upstream, coef_ length would no longer match X.columns —
# verify against the data.

# Plotting coefficients for Linear Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_linear = pd.Series(linear_model.coef_, index=X.columns)
ft_importances_lm_linear.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Coefficients - Linear Regression")
plt.show()

# Plotting coefficients for Lasso Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_lasso = pd.Series(lasso_model.coef_, index=X.columns)
ft_importances_lm_lasso.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Coefficients - Lasso Regression")
plt.show()

# Plotting coefficients for Ridge Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_ridge = pd.Series(ridge_model.coef_, index=X.columns)
ft_importances_lm_ridge.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Coefficients - Ridge Regression")
plt.show()

# Plotting coefficients for Elastic Net Regression
plt.figure(figsize=(15, 10))
ft_importances_lm_elastic_net = pd.Series(elastic_net_model.coef_, index=X.columns)
ft_importances_lm_elastic_net.plot(kind='barh')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Coefficients - Elastic Net Regression")
plt.show()

# Support Vector Regression does not provide coefficients in the same way as linear models

# Plotting feature importance for Decision Tree Regression
plt.figure(figsize=(15, 10))
ft_importances_tree = pd.Series(tree_model.feature_importances_, index=X.columns)
ft_importances_tree.plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Feature Importance - Decision Tree Regression")
plt.show()

# Plotting feature importance for Random Forest Regression
plt.figure(figsize=(15, 10))
ft_importances_rf = pd.Series(rf_model.feature_importances_, index=X.columns)
ft_importances_rf.plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("Feature Variables")
plt.title("EMD_Residual Data Feature Importance - Random Forest Regression")
plt.show()
Linear Regression:
Mean Squared Error: 161589.5170867599
R-squared: 0.8306942125812589

Lasso Regression:
Mean Squared Error: 161593.50097144092
R-squared: 0.8306900384569402

Ridge Regression:
Mean Squared Error: 162331.6835140603
R-squared: 0.8299166060035847

Elastic Net Regression:
Mean Squared Error: 165031.10461440653
R-squared: 0.8270882813498048

Support Vector Regression:
Mean Squared Error: 298695.61362412694
R-squared: 0.6870409852391317

Decision Tree Regression:
Mean Squared Error: 3240.126584973943
R-squared: 0.9966051499336379

Random Forest Regression:
Mean Squared Error: 4885.857836475552
R-squared: 0.9948808312374844
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.346e+05, tolerance: 8.838e+04
  model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
                       Model  R-Squared  Mean Squared Error
0          Linear Regression   0.001368       942591.458133
1           Lasso Regression   0.002814       941225.938013
2           Ridge Regression   0.024348       920900.635013
3     Elastic Net Regression   0.059235       887971.312044
4  Support Vector Regression   0.325167       636963.229115
5   Decision Tree Regression   0.362003       602193.925694
6   Random Forest Regression   0.710667       273095.921391

VIF Results:
    Variable        VIF
0       Psum  46.201812
1       Temp  78.081874
2         pp  14.896199
3        gdp  15.015930
4       gova   5.786582
5         aa   5.477537
6         gp   2.874869
7         ls   8.281318
8        fai   9.254266
9        lgr  11.599613
10      iofp   6.333291
11       loh   8.323424
12      rpop   9.204349
13       hmp  11.487775
14       hpb  50.590835
15       mht  61.867141
16       pte  11.457694
17      tcrv  89.898989
18    pcurei   2.675602
19       pio  11.322218
20       sio  13.748914
21       tio  70.182718
22    croppg   3.269623
23    forest   2.350379
24     grass  23.945826
25      sand   2.530238
26     shrub   1.951590
27     urban   1.312915
28     water   2.006455
29   wetland   1.220154
30     bus_p  15.760790
31   dustgdp  29.858260
32    fosgdp   4.169074
33   inter_p  40.202206
34   invpa_p  16.281991
35  mobile_p   7.781903
36    noxgdp  16.342599
37    road_p  42.365681
38    sdxgdp  47.148103
39     taxiv  82.841972
40     telev  17.652349
41    wateru  10.769069

SVR_Residual with Cross-Validation¶

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\SVR_Res.csv"
df = pd.read_csv(data_path)

# 'EVI' is the dependent variable; every remaining column is a predictor.
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable

# BUG FIX: the original called np.nan_to_num(..., nan=np.nan, posinf=np.nan,
# neginf=np.nan) AFTER imputation, which replaces nothing with anything finite
# (NaN stays NaN, ±inf becomes NaN) and so could still hand non-finite values
# to the estimators. Map ±inf to NaN BEFORE imputing, so the mean imputer
# fills them along with the ordinary missing values.
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values (column means)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Drop any rows whose target is NaN/±inf — scikit-learn estimators reject
# non-finite targets. When y is already finite this is a no-op.
finite_target = np.isfinite(np.asarray(y, dtype=float))
X_imputed = X_imputed[finite_target]
y = np.asarray(y, dtype=float)[finite_target]

# Standardize the features
# NOTE(review): the scaler is fit on the full dataset before the split, so
# test-set statistics leak into training; kept for comparability with the
# recorded outputs, but fitting on X_train only (or a Pipeline) is cleaner.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)

# Fit seven regressors on the standardized training split.
# NOTE(review): the alpha values below are untuned magic numbers carried over
# from the earlier cells; consider a small grid search per dataset.

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Lasso Regression
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)

# Support Vector Regression (linear kernel keeps it comparable to the linear
# models above)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Decision Tree Regression
# NOTE(review): no random_state on the tree/forest — results vary between runs.
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print and return test-set regression metrics.

    Parameters
    ----------
    model : estimator
        Unused — retained so existing call sites keep working.
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mse, r2)`` — returned so callers can collect metrics
        programmatically instead of scraping stdout (previously returned None).
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2

# Report test-set metrics for every fitted model (replaces seven hand-written
# print/evaluate pairs with a single data-driven loop; output is identical).
test_set_reports = [
    ("Linear Regression", linear_model, y_pred_linear),
    ("Lasso Regression", lasso_model, y_pred_lasso),
    ("Ridge Regression", ridge_model, y_pred_ridge),
    ("Elastic Net Regression", elastic_net_model, y_pred_elastic_net),
    ("Support Vector Regression", svr_model, y_pred_svr),
    ("Decision Tree Regression", tree_model, y_pred_tree),
    ("Random Forest Regression", rf_model, y_pred_rf),
]
for report_label, fitted_model, test_preds in test_set_reports:
    print(f"\n{report_label}:")
    evaluate_model(fitted_model, y_test, test_preds)

# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Return (out-of-fold predictions, R^2, MSE) for `model` under cv-fold CV."""
    oof_preds = cross_val_predict(model, x_data, y_data, cv=cv)
    return oof_preds, r2_score(y_data, oof_preds), mean_squared_error(y_data, oof_preds)

# Cross-Validation for each model
# NOTE(review): CV runs on the full X_scaled/y (the earlier held-out test rows
# are included in the folds), and the tree/forest models carry no random_state,
# so these figures shift slightly between runs.
linear_cv_preds, linear_cv_r2, linear_cv_mse = cross_validate(linear_model, X_scaled, y)
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cross_validate(lasso_model, X_scaled, y)
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cross_validate(ridge_model, X_scaled, y)
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cross_validate(elastic_net_model, X_scaled, y)
svr_cv_preds, svr_cv_r2, svr_cv_mse = cross_validate(svr_model, X_scaled, y)
tree_cv_preds, tree_cv_r2, tree_cv_mse = cross_validate(tree_model, X_scaled, y)
rf_cv_preds, rf_cv_r2, rf_cv_mse = cross_validate(rf_model, X_scaled, y)

# Display Cross-Validation Results: one row per model with its out-of-fold
# R^2 and MSE (same frame as before, built from records instead of columns).
cv_records = list(zip(
    ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
     'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression'],
    [linear_cv_r2, lasso_cv_r2, ridge_cv_r2, elastic_net_cv_r2, svr_cv_r2, tree_cv_r2, rf_cv_r2],
    [linear_cv_mse, lasso_cv_mse, ridge_cv_mse, elastic_net_cv_mse, svr_cv_mse, tree_cv_mse, rf_cv_mse],
))
cv_results = pd.DataFrame(cv_records, columns=['Model', 'R-Squared', 'Mean Squared Error'])

print("\nCross-Validation Results:")
print(cv_results)

# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Variance inflation factor for every column of DataFrame X.

    Returns a DataFrame with one row per predictor: ('Variable', 'VIF').
    """
    exog = X.values
    vif_scores = [variance_inflation_factor(exog, col) for col in range(exog.shape[1])]
    return pd.DataFrame({"Variable": X.columns, "VIF": vif_scores})

# Calculate VIF for independent variables
# NOTE(review): VIF is computed on the raw, unscaled X (not X_scaled).
# Columns containing NaN produce NaN VIF values, as seen in the printed
# results below (e.g. 'tcrv' and 'tio').
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)

# Plotting predictions vs actual values for each model.
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']

plt.figure(figsize=(15, 10))

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred, label=name)

# Dashed diagonal: points on it are perfect predictions.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Enhanced Vegetation Index")
# Fixed mislabeled title: this cell loads Original.csv, not the SVR_Residual data.
plt.title("EVI vegetation productivity - All Regression Models on Original Data")
plt.legend()
plt.show()

def plot_linear_coefficients(model, model_label, dataset_label="Original Data"):
    """Horizontal bar chart of a fitted linear model's coefficients.

    model: fitted estimator exposing `coef_` aligned with X.columns.
    model_label: model name shown in the plot title.
    dataset_label: dataset name shown in the plot title.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.coef_, index=X.columns).plot(kind='barh')
    plt.xlabel("Coefficient Value")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Coefficients - {model_label}")
    plt.show()

# One coefficient plot per linear-family model. Previously four copy-pasted
# blocks whose titles wrongly said "SVR_Residual Data" although this cell
# loads Original.csv.
plot_linear_coefficients(linear_model, "Linear Regression")
plot_linear_coefficients(lasso_model, "Lasso Regression")
plot_linear_coefficients(ridge_model, "Ridge Regression")
plot_linear_coefficients(elastic_net_model, "Elastic Net Regression")

# Support Vector Regression does not provide coefficients in the same way as linear models

def plot_feature_importances(model, model_label, dataset_label="Original Data"):
    """Horizontal bar chart of a fitted tree-based model's feature importances.

    model: fitted estimator exposing `feature_importances_` aligned with X.columns.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.feature_importances_, index=X.columns).plot(kind='barh')
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Feature Importance - {model_label}")
    plt.show()

# Previously two copy-pasted blocks whose titles wrongly said "SVR_Residual
# Data" although this cell loads Original.csv.
plot_feature_importances(tree_model, "Decision Tree Regression")
plot_feature_importances(rf_model, "Random Forest Regression")
Linear Regression:
Mean Squared Error: 191029.2347223954
R-squared: 0.799848680856509

Lasso Regression:
Mean Squared Error: 191034.26939142178
R-squared: 0.7998434057704806

Ridge Regression:
Mean Squared Error: 191142.80414797275
R-squared: 0.7997296882301881

Elastic Net Regression:
Mean Squared Error: 191608.8948863569
R-squared: 0.7992413405892435

Support Vector Regression:
Mean Squared Error: 282410.0030080256
R-squared: 0.7041042711419767

Decision Tree Regression:
Mean Squared Error: 2026.0063391507622
R-squared: 0.9978772472079293

Random Forest Regression:
Mean Squared Error: 1481.713775843146
R-squared: 0.9984475309904316

Cross-Validation Results:
                       Model     R-Squared  Mean Squared Error
0          Linear Regression -1.342138e+15        1.266821e+21
1           Lasso Regression -1.638980e+01        1.641392e+07
2           Ridge Regression -7.271596e+00        7.807413e+06
3     Elastic Net Regression -2.039070e+00        2.868524e+06
4  Support Vector Regression  3.534777e-01        6.102409e+05
5   Decision Tree Regression -3.573694e-01        1.281197e+06
6   Random Forest Regression  2.011485e-01        7.540217e+05
C:\Users\Owner\anaconda3\envs\arcpro39\lib\site-packages\statsmodels\regression\linear_model.py:1783: RuntimeWarning: invalid value encountered in scalar divide
  return 1 - self.ssr/self.uncentered_tss
VIF Results:
    Variable         VIF
0       Psum   60.604263
1       Temp  114.843237
2         pp   19.194119
3        gdp   11.391848
4       gova   13.290764
5         aa    5.691484
6         gp    2.829718
7         ls    9.252364
8        fai   13.141497
9        lgr   14.539192
10      iofp   25.967739
11       loh   17.505314
12      rpop   11.057719
13       hmp   13.630785
14       hpb    6.587827
15       mht   31.696345
16       pte   26.365839
17      tcrv         NaN
18    pcurei    4.561716
19       pio   29.720630
20       sio    9.738110
21       tio         NaN
22    croppg    2.516127
23    forest    2.113349
24     grass   18.374692
25      sand    3.403265
26     shrub    2.102264
27     urban    6.026541
28     water    2.911562
29   wetland    2.310738
30     bus_p   10.432935
31   dustgdp   22.514362
32    fosgdp   11.218869
33   inter_p    4.670754
34   invpa_p   13.128986
35  mobile_p    6.275998
36    noxgdp   14.991900
37    road_p   38.888496
38    sdxgdp   28.007735
39     taxiv    3.264132
40     telev   14.209921
41    wateru   14.883773

SVR_Aggregate with Cross-Validation¶

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\SVR_Agg.csv"
df = pd.read_csv(data_path)

# 'EVI' is the dependent variable; every other column is a predictor.
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable

# Convert +/-inf to NaN *before* imputation so the imputer can fill them.
# (The original called np.nan_to_num(..., nan=np.nan, posinf=np.nan) AFTER
# imputation, which is a no-op for NaN and would re-introduce NaN for any
# inf values the imputer never saw.)
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values with each column's mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# The target must be finite for model fitting; fail loudly if it is not.
assert np.isfinite(y).all(), "y ('EVI') contains NaN or infinite values"

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)

# Fit each regression model on the training split and predict on the test split.

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Lasso Regression
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)

# Support Vector Regression
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Decision Tree Regression (no fixed random_state in the original; results vary per run)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print test-set MSE and R-squared for a model's predictions.

    `model` is accepted for interface compatibility with existing callers but
    is not used: the metrics are computed purely from `y_test` vs `y_pred`.

    Returns (mse, r2) so callers can also tabulate the scores; existing
    callers that ignore the return value are unaffected.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    return mse, r2

# Report test-set metrics for every fitted model.
# (Same output as seven copy-pasted print/evaluate pairs.)
evaluation_cases = [
    ("Linear Regression", linear_model, y_pred_linear),
    ("Lasso Regression", lasso_model, y_pred_lasso),
    ("Ridge Regression", ridge_model, y_pred_ridge),
    ("Elastic Net Regression", elastic_net_model, y_pred_elastic_net),
    ("Support Vector Regression", svr_model, y_pred_svr),
    ("Decision Tree Regression", tree_model, y_pred_tree),
    ("Random Forest Regression", rf_model, y_pred_rf),
]
for label, fitted, predictions in evaluation_cases:
    print(f"\n{label}:")
    evaluate_model(fitted, y_test, predictions)

# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Cross-validate `model` and score its out-of-fold predictions.

    Every sample is predicted by a fold that did not train on it; R-squared
    and MSE are then computed over those out-of-fold predictions.

    Returns (predictions, r2, mse).
    """
    oof_predictions = cross_val_predict(model, x_data, y_data, cv=cv)
    oof_r2 = r2_score(y_data, oof_predictions)
    oof_mse = mean_squared_error(y_data, oof_predictions)
    return oof_predictions, oof_r2, oof_mse

# Cross-validate every model and collect the scores for display.
cv_model_registry = [
    ('Linear Regression', linear_model),
    ('Lasso Regression', lasso_model),
    ('Ridge Regression', ridge_model),
    ('Elastic Net Regression', elastic_net_model),
    ('Support Vector Regression', svr_model),
    ('Decision Tree Regression', tree_model),
    ('Random Forest Regression', rf_model),
]
cv_outputs = {label: cross_validate(est, X_scaled, y) for label, est in cv_model_registry}

# Keep the individually named results bound, exactly as before.
linear_cv_preds, linear_cv_r2, linear_cv_mse = cv_outputs['Linear Regression']
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cv_outputs['Lasso Regression']
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cv_outputs['Ridge Regression']
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cv_outputs['Elastic Net Regression']
svr_cv_preds, svr_cv_r2, svr_cv_mse = cv_outputs['Support Vector Regression']
tree_cv_preds, tree_cv_r2, tree_cv_mse = cv_outputs['Decision Tree Regression']
rf_cv_preds, rf_cv_r2, rf_cv_mse = cv_outputs['Random Forest Regression']

# Display Cross-Validation Results
cv_results = pd.DataFrame({
    'Model': [label for label, _ in cv_model_registry],
    'R-Squared': [cv_outputs[label][1] for label, _ in cv_model_registry],
    'Mean Squared Error': [cv_outputs[label][2] for label, _ in cv_model_registry],
})

print("\nCross-Validation Results:")
print(cv_results)

# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of each column.

    X: DataFrame of predictors. High VIF values indicate that a column is
    strongly linearly explained by the other columns (multicollinearity).
    """
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    # One VIF per column index, regressing that column on all the others.
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF for independent variables
# NOTE(review): computed on the raw, unscaled X (not the imputed/scaled
# matrix used for model fitting).
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)

# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']

plt.figure(figsize=(15, 10))

# One scatter series per model: x = observed EVI, y = that model's prediction.
for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred, label=name)

# Dashed diagonal: points on it are perfect predictions.
diag = [min(y_test), max(y_test)]
plt.plot(diag, diag, color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on SVR_Aggregate Data")
plt.legend()
plt.show()

def _plot_coefficients(model, model_label, dataset_label="SVR_Aggregate Data"):
    """Horizontal bar chart of a fitted linear model's coefficients.

    model: fitted estimator exposing `coef_` aligned with X.columns.
    model_label: model name shown in the plot title.
    dataset_label: dataset name shown in the plot title.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.coef_, index=X.columns).plot(kind='barh')
    plt.xlabel("Coefficient Value")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Coefficients - {model_label}")
    plt.show()

# One coefficient plot per linear-family model (previously four copy-pasted blocks).
_plot_coefficients(linear_model, "Linear Regression")
_plot_coefficients(lasso_model, "Lasso Regression")
_plot_coefficients(ridge_model, "Ridge Regression")
_plot_coefficients(elastic_net_model, "Elastic Net Regression")

# Support Vector Regression does not provide coefficients in the same way as linear models

def _plot_importances(model, model_label, dataset_label="SVR_Aggregate Data"):
    """Horizontal bar chart of a fitted tree-based model's feature importances.

    model: fitted estimator exposing `feature_importances_` aligned with X.columns.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.feature_importances_, index=X.columns).plot(kind='barh')
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Feature Importance - {model_label}")
    plt.show()

# One importance plot per tree-based model (previously two copy-pasted blocks).
_plot_importances(tree_model, "Decision Tree Regression")
_plot_importances(rf_model, "Random Forest Regression")
Linear Regression:
Mean Squared Error: 248917.07236289946
R-squared: 0.7678656394878618

Lasso Regression:
Mean Squared Error: 248933.91214058246
R-squared: 0.7678499350968913

Ridge Regression:
Mean Squared Error: 248714.65693129468
R-squared: 0.7680544074832721

Elastic Net Regression:
Mean Squared Error: 248460.46874279427
R-squared: 0.768291457565965

Support Vector Regression:
Mean Squared Error: 368410.78910387954
R-squared: 0.6564285361282114

Decision Tree Regression:
Mean Squared Error: 120936.7605888516
R-squared: 0.8872171469991669

Random Forest Regression:
Mean Squared Error: 69221.83308766509
R-squared: 0.9354453039128781
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.909e+05, tolerance: 9.482e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.861e+05, tolerance: 9.569e+04
  model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
                       Model  R-Squared  Mean Squared Error
0          Linear Regression  -1.406768        2.456676e+06
1           Lasso Regression  -1.387017        2.436516e+06
2           Ridge Regression  -0.551109        1.583274e+06
3     Elastic Net Regression   0.086989        9.319438e+05
4  Support Vector Regression   0.444086        5.674422e+05
5   Decision Tree Regression   0.543226        4.662462e+05
6   Random Forest Regression   0.697915        3.083493e+05

VIF Results:
    Variable         VIF
0       Psum   21.349290
1       Temp   64.008951
2         pp  102.315860
3        gdp   24.284434
4       gova   21.969581
5         aa   11.697837
6         gp    7.883940
7         ls    2.598890
8        fai   12.097485
9        lgr   21.400360
10      iofp    3.015758
11       loh    6.784207
12      rpop   23.224089
13       hmp   42.382273
14       hpb   36.258598
15       mht   77.773037
16       pte   21.528149
17      tcrv   58.645647
18    pcurei    2.776087
19       pio   33.625959
20       sio   27.232431
21       tio   51.229273
22    croppg    2.817496
23    forest    1.698973
24     grass   26.368864
25      sand    2.378605
26     shrub    1.474054
27     urban    1.186559
28     water    1.302964
29   wetland    1.274774
30     bus_p   14.514227
31   dustgdp    1.483317
32    fosgdp    4.335904
33   inter_p   14.804122
34   invpa_p   13.934623
35  mobile_p    4.929697
36    noxgdp   16.101705
37    road_p   34.033373
38    sdxgdp    3.073364
39     taxiv   65.064443
40     telev    9.360795
41    wateru   14.420194

TheiSenSlopes (Theil–Sen slopes data) with Cross-Validation¶

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the data
data_path = r"C:\Users\Owner\Desktop\GA_data\TheiSenSlopes.csv"
df = pd.read_csv(data_path)

# 'EVI' is the dependent variable; every other column is a predictor.
X = df.drop('EVI', axis=1)  # Independent variables
y = df['EVI']  # Dependent variable

# Convert +/-inf to NaN *before* imputation so the imputer can fill them.
# (The original called np.nan_to_num(..., nan=np.nan, posinf=np.nan) AFTER
# imputation, which is a no-op for NaN and would re-introduce NaN for any
# inf values the imputer never saw.)
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values with each column's mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# The target must be finite for model fitting; fail loudly if it is not.
assert np.isfinite(y).all(), "y ('EVI') contains NaN or infinite values"

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=43)

# Fit each regression model on the training split and predict on the test split.

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Lasso Regression
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Elastic Net Regression
elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)

# Support Vector Regression
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Decision Tree Regression (no fixed random_state in the original; results vary per run)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Evaluate the models
def evaluate_model(model, y_test, y_pred):
    """Print test-set MSE and R-squared for a set of predictions.

    Note: the `model` argument is not used; the metrics are computed purely
    from `y_test` vs `y_pred`. It is kept for interface compatibility with
    the existing call sites.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

# Report test-set metrics for every fitted model.
# (Same output as seven copy-pasted print/evaluate pairs.)
evaluation_cases = [
    ("Linear Regression", linear_model, y_pred_linear),
    ("Lasso Regression", lasso_model, y_pred_lasso),
    ("Ridge Regression", ridge_model, y_pred_ridge),
    ("Elastic Net Regression", elastic_net_model, y_pred_elastic_net),
    ("Support Vector Regression", svr_model, y_pred_svr),
    ("Decision Tree Regression", tree_model, y_pred_tree),
    ("Random Forest Regression", rf_model, y_pred_rf),
]
for label, fitted, predictions in evaluation_cases:
    print(f"\n{label}:")
    evaluate_model(fitted, y_test, predictions)

# Cross-Validation
def cross_validate(model, x_data, y_data, cv=5):
    """Cross-validate `model` and score its out-of-fold predictions.

    Returns (out-of-fold predictions, R-squared, mean squared error).
    """
    oof = cross_val_predict(model, x_data, y_data, cv=cv)
    return oof, r2_score(y_data, oof), mean_squared_error(y_data, oof)

# Cross-validate every model and collect the scores for display.
cv_model_registry = [
    ('Linear Regression', linear_model),
    ('Lasso Regression', lasso_model),
    ('Ridge Regression', ridge_model),
    ('Elastic Net Regression', elastic_net_model),
    ('Support Vector Regression', svr_model),
    ('Decision Tree Regression', tree_model),
    ('Random Forest Regression', rf_model),
]
cv_outputs = {label: cross_validate(est, X_scaled, y) for label, est in cv_model_registry}

# Keep the individually named results bound, exactly as before.
linear_cv_preds, linear_cv_r2, linear_cv_mse = cv_outputs['Linear Regression']
lasso_cv_preds, lasso_cv_r2, lasso_cv_mse = cv_outputs['Lasso Regression']
ridge_cv_preds, ridge_cv_r2, ridge_cv_mse = cv_outputs['Ridge Regression']
elastic_net_cv_preds, elastic_net_cv_r2, elastic_net_cv_mse = cv_outputs['Elastic Net Regression']
svr_cv_preds, svr_cv_r2, svr_cv_mse = cv_outputs['Support Vector Regression']
tree_cv_preds, tree_cv_r2, tree_cv_mse = cv_outputs['Decision Tree Regression']
rf_cv_preds, rf_cv_r2, rf_cv_mse = cv_outputs['Random Forest Regression']

# Display Cross-Validation Results
cv_results = pd.DataFrame({
    'Model': [label for label, _ in cv_model_registry],
    'R-Squared': [cv_outputs[label][1] for label, _ in cv_model_registry],
    'Mean Squared Error': [cv_outputs[label][2] for label, _ in cv_model_registry],
})

print("\nCross-Validation Results:")
print(cv_results)

# Model Multicollinearity Test (VIF)
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of each column of X.

    One row per predictor: its name ("Variable") and its VIF ("VIF"),
    computed by `statsmodels.variance_inflation_factor` against all other columns.
    """
    vif_records = [
        {"Variable": column, "VIF": variance_inflation_factor(X.values, position)}
        for position, column in enumerate(X.columns)
    ]
    return pd.DataFrame(vif_records, columns=["Variable", "VIF"])

# Calculate VIF for independent variables
# NOTE(review): computed on the raw, unscaled X; the very large VIFs printed
# below indicate severe multicollinearity in this dataset.
vif_results = calculate_vif(X)
print("\nVIF Results:")
print(vif_results)

# Plotting predictions vs actual values for each model
models = [linear_model, lasso_model, ridge_model, elastic_net_model, svr_model, tree_model, rf_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression',
               'Support Vector Regression', 'Decision Tree Regression', 'Random Forest Regression']

plt.figure(figsize=(15, 10))

# One scatter series per model: x = observed EVI, y = that model's prediction.
for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    plt.scatter(y_test, y_pred, label=name)

# Dashed diagonal: points on it are perfect predictions.
diag = [min(y_test), max(y_test)]
plt.plot(diag, diag, color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual Enhanced Vegetation Index")
plt.ylabel("Predicted Vegetation Productivity")
plt.title("EVI vegetation productivity - All Regression Models on TheiSenSlopes Data")
plt.legend()
plt.show()

def _plot_coefficients(model, model_label, dataset_label="TheiSenSlopes Data"):
    """Horizontal bar chart of a fitted linear model's coefficients.

    model: fitted estimator exposing `coef_` aligned with X.columns.
    model_label: model name shown in the plot title.
    dataset_label: dataset name shown in the plot title.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.coef_, index=X.columns).plot(kind='barh')
    plt.xlabel("Coefficient Value")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Coefficients - {model_label}")
    plt.show()

# One coefficient plot per linear-family model (previously four copy-pasted blocks).
_plot_coefficients(linear_model, "Linear Regression")
_plot_coefficients(lasso_model, "Lasso Regression")
_plot_coefficients(ridge_model, "Ridge Regression")
_plot_coefficients(elastic_net_model, "Elastic Net Regression")

# Support Vector Regression does not provide coefficients in the same way as linear models

def _plot_importances(model, model_label, dataset_label="TheiSenSlopes Data"):
    """Horizontal bar chart of a fitted tree-based model's feature importances.

    model: fitted estimator exposing `feature_importances_` aligned with X.columns.
    """
    plt.figure(figsize=(15, 10))
    pd.Series(model.feature_importances_, index=X.columns).plot(kind='barh')
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature Variables")
    plt.title(f"{dataset_label} Feature Importance - {model_label}")
    plt.show()

# One importance plot per tree-based model (previously two copy-pasted blocks).
_plot_importances(tree_model, "Decision Tree Regression")
_plot_importances(rf_model, "Random Forest Regression")
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.813e+07, tolerance: 8.208e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.089e+07, tolerance: 8.208e+04
  model = cd_fast.enet_coordinate_descent(
Linear Regression:
Mean Squared Error: 85477.52029500603
R-squared: 0.912689865591831

Lasso Regression:
Mean Squared Error: 86436.38893434322
R-squared: 0.9117104390772178

Ridge Regression:
Mean Squared Error: 84944.32132575687
R-squared: 0.9132344961977583

Elastic Net Regression:
Mean Squared Error: 87490.01617333169
R-squared: 0.9106342223650963

Support Vector Regression:
Mean Squared Error: 211958.5823537373
R-squared: 0.7834970849598806

Decision Tree Regression:
Mean Squared Error: 7503.955761272335
R-squared: 0.9923351615272826

Random Forest Regression:
Mean Squared Error: 16989.891045389293
R-squared: 0.9826458504454328
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.897e+07, tolerance: 8.777e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.418e+07, tolerance: 8.954e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.802e+07, tolerance: 9.770e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.694e+07, tolerance: 8.864e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.989e+07, tolerance: 1.028e+05
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.684e+07, tolerance: 8.777e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.086e+07, tolerance: 8.954e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.848e+07, tolerance: 9.770e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.365e+07, tolerance: 8.864e+04
  model = cd_fast.enet_coordinate_descent(
C:\Users\Owner\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.738e+07, tolerance: 1.028e+05
  model = cd_fast.enet_coordinate_descent(
Cross-Validation Results:
                       Model  R-Squared  Mean Squared Error
0          Linear Regression  -2.559135        3.434514e+06
1           Lasso Regression  -1.955554        2.852066e+06
2           Ridge Regression  -0.508402        1.455585e+06
3     Elastic Net Regression   0.154341        8.160482e+05
4  Support Vector Regression   0.586065        3.994415e+05
5   Decision Tree Regression   0.577707        4.075065e+05
6   Random Forest Regression   0.752843        2.385031e+05

VIF Results:
    Variable          VIF
0       Psum    95.325265
1       Temp   138.755380
2         pp   313.749531
3        gdp   538.821338
4       gova   354.097648
5         aa    16.568545
6         gp    15.482781
7         ls    14.244668
8        fai   185.895653
9        lgr   199.701678
10      iofp     4.230339
11       loh    10.225547
12      rpop    71.771812
13       hmp    64.460563
14       hpb   193.395111
15       mht   164.132824
16       pte    53.759338
17      tcrv   446.530381
18    pcurei    88.105709
19       pio   343.920908
20       sio   109.362204
21       tio   205.115146
22    croppg     7.110237
23    forest    15.110842
24     grass    48.239407
25      sand     3.094659
26     shrub    12.424985
27     urban     3.272255
28     water     2.649976
29   wetland     1.809359
30     bus_p   403.303588
31   dustgdp  2970.874935
32    fosgdp    17.101441
33   inter_p   167.704628
34   invpa_p    39.852756
35  mobile_p   288.768518
36    noxgdp   532.880801
37    road_p   133.143666
38    sdxgdp  2080.111808
39     taxiv   256.607063
40     telev   340.640595
41    wateru    89.470541
In [ ]: